{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "from utils.utils import save_json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of sample = 35097\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>change-speed</th>\n",
       "      <th>chepai-location</th>\n",
       "      <th>kilometer</th>\n",
       "      <th>price</th>\n",
       "      <th>shangpai-date</th>\n",
       "      <th>standard</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>自动</td>\n",
       "      <td>郑州</td>\n",
       "      <td>4.83万公里</td>\n",
       "      <td>20.68</td>\n",
       "      <td>2年10个月</td>\n",
       "      <td>国五</td>\n",
       "      <td>阿尔法·罗密欧-Stelvio 2018款 2.0T 200HP 精英版</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>自动</td>\n",
       "      <td>郑州</td>\n",
       "      <td>3.36万公里</td>\n",
       "      <td>21.88</td>\n",
       "      <td>2年10个月</td>\n",
       "      <td>国五</td>\n",
       "      <td>阿尔法·罗密欧-Stelvio 2018款 2.0T 200HP 精英版</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>自动</td>\n",
       "      <td>佛山</td>\n",
       "      <td>6.06万公里</td>\n",
       "      <td>19.98</td>\n",
       "      <td>2年11个月</td>\n",
       "      <td>国五</td>\n",
       "      <td>阿尔法·罗密欧-Stelvio 2018款 2.0T 200HP 精英版</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  change-speed chepai-location kilometer  price shangpai-date standard  \\\n",
       "0           自动              郑州   4.83万公里  20.68        2年10个月       国五   \n",
       "1           自动              郑州   3.36万公里  21.88        2年10个月       国五   \n",
       "2           自动              佛山   6.06万公里  19.98        2年11个月       国五   \n",
       "\n",
       "                                  title  \n",
       "0  阿尔法·罗密欧-Stelvio 2018款 2.0T 200HP 精英版  \n",
       "1  阿尔法·罗密欧-Stelvio 2018款 2.0T 200HP 精英版  \n",
       "2  阿尔法·罗密欧-Stelvio 2018款 2.0T 200HP 精英版  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = []\n",
    "data_dir = 'data_new'\n",
    "# data_dir = 'data'\n",
    "num_csv = 641 + 1\n",
    "for i in range(num_csv):\n",
    "    try:\n",
    "        df.append(pd.read_csv(f'{data_dir}/car_info_requests_%06d.csv' % i))\n",
    "    except:\n",
    "        continue\n",
    "df = pd.concat(df,axis=0)\n",
    "df.dropna(axis=0,how='any',inplace=True)\n",
    "print(f'number of sample = {len(df)}')\n",
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_title(df_ori):\n",
    "    df = df_ori.copy()\n",
    "    df.reset_index(inplace=True)\n",
    "    df.loc[:,'title'] = df['title'].str.split('-',1,expand=True)[0]\n",
    "    all_titles = df['title'].unique().tolist()\n",
    "    title2idx = {\n",
    "        title: idx for idx,title in enumerate(all_titles)\n",
    "    }\n",
    "    save_json(title2idx,f'{data_dir}/car_type_refinement.json')\n",
    "\n",
    "    for index in df.index:\n",
    "        df.loc[index,'title'] = title2idx[df.loc[index,'title']]\n",
    "    df.loc[:,'title'] = df['title'].astype(int)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_shangpai_date(df_ori):\n",
    "    df = df_ori.copy()\n",
    "    df.reset_index(inplace=True)\n",
    "    # print(df.index)\n",
    "    for index in df.index:\n",
    "        shangpai = df.loc[index,'shangpai-date']\n",
    "        # print(shangpai)\n",
    "        ans = 0\n",
    "        if \"年\" in shangpai:\n",
    "            year = int(re.search(r'(\\d+)年',shangpai).group()[0])\n",
    "            ans += year\n",
    "        if \"月\" in shangpai:\n",
    "            month = int(re.search(r'(\\d+)个月',shangpai).group()[0])\n",
    "            ans += (month/12.0)\n",
    "        df.loc[index,'shangpai-date'] = ans\n",
    "    df.loc[:,'shangpai-date'] = df['shangpai-date'].astype(float)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_change_speed(df_ori):\n",
    "    df = df_ori.copy()\n",
    "    df.loc[df['change-speed']=='自动','change-speed'] = 0\n",
    "    df.loc[df['change-speed']=='手动','change-speed'] = 1\n",
    "    df.loc[\n",
    "        ~((df['change-speed']==0)|\n",
    "        (df['change-speed']==1))\n",
    "    ,'change-speed'] = 2\n",
    "    df['change-speed'] = df['change-speed'].astype(int)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_kilometer(df_ori):\n",
    "    df = df_ori.copy()\n",
    "    df.loc[:,'kilometer'] = df['kilometer'].str.split('万',1,expand=True)[0].astype(float)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_price(df_ori):\n",
    "    df = df_ori.copy()\n",
    "    df.loc[:,'price'] = df['price'].astype(float)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_standard(df_ori):\n",
    "    df = df_ori.copy()\n",
    "    conds = [\n",
    "        df['standard']=='国一',\n",
    "        df['standard']=='国二',\n",
    "        df['standard']=='国三',\n",
    "        df['standard']=='国四',\n",
    "        df['standard']=='国五',\n",
    "        df['standard']=='国六',\n",
    "    ]\n",
    "    tmp = conds[0]\n",
    "    for i in range(1,len(conds)):\n",
    "        tmp = (tmp) | (conds[i])\n",
    "    tmp = ~tmp\n",
    "    df.loc[tmp,'standard'] = 0\n",
    "\n",
    "    for i in range(len(conds)):\n",
    "        df.loc[conds[i],'standard'] = i+1\n",
    "    df.loc[:,'standard'] = df['standard'].astype(int)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>change-speed</th>\n",
       "      <th>kilometer</th>\n",
       "      <th>price</th>\n",
       "      <th>shangpai-date</th>\n",
       "      <th>standard</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>4.83</td>\n",
       "      <td>20.68</td>\n",
       "      <td>2.083333</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>3.36</td>\n",
       "      <td>21.88</td>\n",
       "      <td>2.083333</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>6.06</td>\n",
       "      <td>19.98</td>\n",
       "      <td>2.083333</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>3.53</td>\n",
       "      <td>20.30</td>\n",
       "      <td>3.666667</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   change-speed  kilometer  price  shangpai-date  standard  title\n",
       "0             0       4.83  20.68       2.083333         5      0\n",
       "1             0       3.36  21.88       2.083333         5      0\n",
       "2             0       6.06  19.98       2.083333         5      0\n",
       "3             0       3.53  20.30       3.666667         5      0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = clean_kilometer(df)\n",
    "df = clean_price(df)\n",
    "df = clean_change_speed(df)\n",
    "df = clean_shangpai_date(df)\n",
    "df = clean_standard(df)\n",
    "df = clean_title(df)\n",
    "df.drop(['chepai-location'],axis=1,inplace=True)\n",
    "df.drop(['index'],axis=1,inplace=True)\n",
    "df.drop(['level_0'],axis=1,inplace=True)\n",
    "df.head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(f'{data_dir}/data.csv',index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_all = len(df)\n",
    "train = int(0.9*num_all)\n",
    "df_shuffle = df.sample(frac=1.0)\n",
    "df_train = df_shuffle.iloc[0:train,:].copy()\n",
    "df_test = df_shuffle.iloc[train:,:].copy()\n",
    "df_train.to_csv(f'{data_dir}/data_train.csv', index=None)\n",
    "df_test.to_csv(f'{data_dir}/data_test.csv', index=None)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "96425d339b0e6107dbb91cb157c2db64734def97b9748c3a4592256fc09ea454"
  },
  "kernelspec": {
   "display_name": "Python 3.6.13 ('pypots')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
